{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Constructing characteristic clusters for \"Factor Momentum\"\n",
    "\n",
    "- Take annual CRSP/Compustat data with relevant characteristics as the input\n",
    "  - The input file comes from Stata as a csv file\n",
    "- Transforms characteristics as required (e.g., z-scores)\n",
    "- Assign stocks into N clusters with algorithm f\n",
    "  - Do this for different numbers of clusters and algorithms\n",
    "- Output clusters into a csv file for reading back to Stata"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "from sklearn.cluster import KMeans\n",
    "from sklearn.cluster import AgglomerativeClustering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--all months (full)--\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>marketbeta</th>\n",
       "      <th>idiosyncraticvol</th>\n",
       "      <th>ltrev</th>\n",
       "      <th>size</th>\n",
       "      <th>qmj</th>\n",
       "      <th>assetgrowth</th>\n",
       "      <th>booktomarket</th>\n",
       "      <th>ffprofitability</th>\n",
       "      <th>exchcd</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yyyymm</th>\n",
       "      <th>permno</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>198806</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.090655</td>\n",
       "      <td>0.575131</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.2</td>\n",
       "      <td>-0.525444</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198807</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.086793</td>\n",
       "      <td>0.508968</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.2</td>\n",
       "      <td>-0.528242</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198808</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.079474</td>\n",
       "      <td>0.449557</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.2</td>\n",
       "      <td>-0.535590</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198809</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.072340</td>\n",
       "      <td>0.432542</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.2</td>\n",
       "      <td>-0.551552</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198810</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.073450</td>\n",
       "      <td>0.539960</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.2</td>\n",
       "      <td>-0.541882</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "               marketbeta  idiosyncraticvol  ltrev  size       qmj  \\\n",
       "yyyymm permno                                                        \n",
       "198806 10001     0.090655          0.575131    NaN   6.2 -0.525444   \n",
       "198807 10001     0.086793          0.508968    NaN   6.2 -0.528242   \n",
       "198808 10001     0.079474          0.449557    NaN   6.2 -0.535590   \n",
       "198809 10001     0.072340          0.432542    NaN   6.2 -0.551552   \n",
       "198810 10001     0.073450          0.539960    NaN   6.2 -0.541882   \n",
       "\n",
       "               assetgrowth  booktomarket  ffprofitability  exchcd  \n",
       "yyyymm permno                                                      \n",
       "198806 10001     -0.038474      1.207618         0.110827       3  \n",
       "198807 10001     -0.038474      1.207618         0.110827       3  \n",
       "198808 10001     -0.038474      1.207618         0.110827       3  \n",
       "198809 10001     -0.038474      1.207618         0.110827       3  \n",
       "198810 10001     -0.038474      1.207618         0.110827       3  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(2374161, 9)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--June data (small)--\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>size</th>\n",
       "      <th>assetgrowth</th>\n",
       "      <th>booktomarket</th>\n",
       "      <th>ffprofitability</th>\n",
       "      <th>exchcd</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yyyymm</th>\n",
       "      <th>permno</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>198806</th>\n",
       "      <th>10001</th>\n",
       "      <td>6.20000</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198906</th>\n",
       "      <th>10001</th>\n",
       "      <td>7.00700</td>\n",
       "      <td>-0.003058</td>\n",
       "      <td>1.145192</td>\n",
       "      <td>0.152484</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199006</th>\n",
       "      <th>10001</th>\n",
       "      <td>10.05225</td>\n",
       "      <td>0.582020</td>\n",
       "      <td>0.818149</td>\n",
       "      <td>0.283841</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199106</th>\n",
       "      <th>10001</th>\n",
       "      <td>11.26650</td>\n",
       "      <td>0.017021</td>\n",
       "      <td>0.942575</td>\n",
       "      <td>0.263297</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>199206</th>\n",
       "      <th>10001</th>\n",
       "      <td>12.63125</td>\n",
       "      <td>0.038028</td>\n",
       "      <td>0.667907</td>\n",
       "      <td>0.229853</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                   size  assetgrowth  booktomarket  ffprofitability  exchcd\n",
       "yyyymm permno                                                              \n",
       "198806 10001    6.20000    -0.038474      1.207618         0.110827       3\n",
       "198906 10001    7.00700    -0.003058      1.145192         0.152484       3\n",
       "199006 10001   10.05225     0.582020      0.818149         0.283841       3\n",
       "199106 10001   11.26650     0.017021      0.942575         0.263297       3\n",
       "199206 10001   12.63125     0.038028      0.667907         0.229853       3"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "(206540, 5)"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "First and last date:  196306 202111\n"
     ]
    }
   ],
   "source": [
    "df = pd.read_csv('~/Dropbox/Research/Factor Momentum/Python/chardata.csv')\n",
    "\n",
    "df_full = df.set_index(['yyyymm', 'permno'])\n",
    "df_small = df[df['yyyymm'] % 100 ==6].set_index(['yyyymm', 'permno'])\n",
    "\n",
    "# select subset of columns for small\n",
    "df_small = df_small.drop(columns=['marketbeta', 'idiosyncraticvol', 'ltrev', 'qmj'])\n",
    "\n",
    "print('--all months (full)--')\n",
    "display(df_full.head(5))\n",
    "display(df_full.shape)\n",
    "print('--June data (small)--')\n",
    "display(df_small.head(5))\n",
    "display(df_small.shape)\n",
    "print('\\nFirst and last date: ', df_full.index.get_level_values(level=0).min(), df_full.index.get_level_values(level=0).max())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Define characteristics and convert to *z*-scores and NYSE-based percentile ranks in each cross section\n",
    "\n",
    "NYSE-based percentile ranks are computed by ranking all NYSE stocks first from 1/N to N/(N+1), converting these ranks into percentile values, and then assigning non-NYSE stocks percentile ranks via linear interpolation "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'marketbeta'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'idiosyncraticvol'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'ltrev'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'size'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'qmj'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'assetgrowth'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'booktomarket'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'ffprofitability'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'size'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'assetgrowth'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'booktomarket'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "'ffprofitability'"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "for df in [df_full, df_small]:\n",
    "\n",
    "    characteristics = list(df.columns)\n",
    "    characteristics.remove('exchcd')\n",
    "    \n",
    "    for c in characteristics:        \n",
    "    \n",
    "        display(c)\n",
    "\n",
    "        data = df[['exchcd', c]].dropna()\n",
    "\n",
    "        # nyse-based percentile ranks\n",
    "\n",
    "        ranks = data[data['exchcd']==1][c].groupby(level=0).rank()\n",
    "        p = ranks / (ranks.groupby(level=0).max() + 1)\n",
    "\n",
    "        data['p'] = p\n",
    "\n",
    "        # 1st and last zero and one if needed\n",
    "\n",
    "        min_idx = data[c].groupby(level=0).idxmin()\n",
    "        max_idx = data[c].groupby(level=0).idxmax()\n",
    "\n",
    "        data.loc[min_idx, 'p'] = data.loc[min_idx, 'p'].replace({np.nan: 0})\n",
    "        data.loc[max_idx, 'p'] = data.loc[max_idx, 'p'].replace({np.nan: 1})\n",
    "\n",
    "        data = data.sort_values(by=c)\n",
    "\n",
    "        data['p2'] = data['p'].interpolate()\n",
    "\n",
    "        plabel = \"p\" + str(characteristics.index(c))\n",
    "        df[plabel] = data['p2']\n",
    "\n",
    "        # z-scores\n",
    "        #z = (data[c] - data[c].groupby(level=0).mean()) / data[c].groupby(level=0).std()\n",
    "\n",
    "        #clabel = \"c\" + str(characteristics.index(c))\n",
    "        #df[clabel] = z \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>size</th>\n",
       "      <th>assetgrowth</th>\n",
       "      <th>booktomarket</th>\n",
       "      <th>ffprofitability</th>\n",
       "      <th>exchcd</th>\n",
       "      <th>p0</th>\n",
       "      <th>p1</th>\n",
       "      <th>p2</th>\n",
       "      <th>p3</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>permno</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>38789</th>\n",
       "      <td>0.593125</td>\n",
       "      <td>-0.086207</td>\n",
       "      <td>2.818630</td>\n",
       "      <td>-0.165889</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.052742</td>\n",
       "      <td>0.952540</td>\n",
       "      <td>0.017586</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29639</th>\n",
       "      <td>0.896250</td>\n",
       "      <td>-0.162162</td>\n",
       "      <td>1.671409</td>\n",
       "      <td>0.086782</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.044891</td>\n",
       "      <td>0.932757</td>\n",
       "      <td>0.088467</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30568</th>\n",
       "      <td>1.264375</td>\n",
       "      <td>-0.085106</td>\n",
       "      <td>1.259664</td>\n",
       "      <td>0.220147</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000604</td>\n",
       "      <td>0.093094</td>\n",
       "      <td>0.886780</td>\n",
       "      <td>0.461022</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32117</th>\n",
       "      <td>1.354500</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.516796</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.143552</td>\n",
       "      <td>0.472956</td>\n",
       "      <td>0.334537</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31130</th>\n",
       "      <td>1.464375</td>\n",
       "      <td>0.285714</td>\n",
       "      <td>0.692446</td>\n",
       "      <td>0.187377</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.939071</td>\n",
       "      <td>0.734057</td>\n",
       "      <td>0.406594</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11703</th>\n",
       "      <td>10834.178000</td>\n",
       "      <td>-0.122394</td>\n",
       "      <td>0.165151</td>\n",
       "      <td>0.495604</td>\n",
       "      <td>1</td>\n",
       "      <td>0.993515</td>\n",
       "      <td>0.010376</td>\n",
       "      <td>0.024643</td>\n",
       "      <td>0.950713</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12490</th>\n",
       "      <td>16051.984000</td>\n",
       "      <td>0.393998</td>\n",
       "      <td>0.159031</td>\n",
       "      <td>0.585454</td>\n",
       "      <td>1</td>\n",
       "      <td>0.994812</td>\n",
       "      <td>0.962387</td>\n",
       "      <td>0.020752</td>\n",
       "      <td>0.975357</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11850</th>\n",
       "      <td>16878.682000</td>\n",
       "      <td>0.041086</td>\n",
       "      <td>0.429970</td>\n",
       "      <td>0.243114</td>\n",
       "      <td>1</td>\n",
       "      <td>0.996109</td>\n",
       "      <td>0.333333</td>\n",
       "      <td>0.276265</td>\n",
       "      <td>0.451362</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12079</th>\n",
       "      <td>27270.721000</td>\n",
       "      <td>0.042689</td>\n",
       "      <td>0.263149</td>\n",
       "      <td>0.490237</td>\n",
       "      <td>1</td>\n",
       "      <td>0.997406</td>\n",
       "      <td>0.346304</td>\n",
       "      <td>0.079118</td>\n",
       "      <td>0.948119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10401</th>\n",
       "      <td>35196.160000</td>\n",
       "      <td>0.093073</td>\n",
       "      <td>0.528973</td>\n",
       "      <td>0.231543</td>\n",
       "      <td>1</td>\n",
       "      <td>0.998703</td>\n",
       "      <td>0.636835</td>\n",
       "      <td>0.399481</td>\n",
       "      <td>0.394293</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>992 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                size  assetgrowth  booktomarket  ffprofitability  exchcd  \\\n",
       "permno                                                                     \n",
       "38789       0.593125    -0.086207      2.818630        -0.165889       2   \n",
       "29639       0.896250    -0.162162      1.671409         0.086782       2   \n",
       "30568       1.264375    -0.085106      1.259664         0.220147       2   \n",
       "32117       1.354500     0.000000      0.516796         0.200000       2   \n",
       "31130       1.464375     0.285714      0.692446         0.187377       2   \n",
       "...              ...          ...           ...              ...     ...   \n",
       "11703   10834.178000    -0.122394      0.165151         0.495604       1   \n",
       "12490   16051.984000     0.393998      0.159031         0.585454       1   \n",
       "11850   16878.682000     0.041086      0.429970         0.243114       1   \n",
       "12079   27270.721000     0.042689      0.263149         0.490237       1   \n",
       "10401   35196.160000     0.093073      0.528973         0.231543       1   \n",
       "\n",
       "              p0        p1        p2        p3  \n",
       "permno                                          \n",
       "38789   0.000000  0.052742  0.952540  0.017586  \n",
       "29639   0.000000  0.044891  0.932757  0.088467  \n",
       "30568   0.000604  0.093094  0.886780  0.461022  \n",
       "32117   0.000000  0.143552  0.472956  0.334537  \n",
       "31130   0.000000  0.939071  0.734057  0.406594  \n",
       "...          ...       ...       ...       ...  \n",
       "11703   0.993515  0.010376  0.024643  0.950713  \n",
       "12490   0.994812  0.962387  0.020752  0.975357  \n",
       "11850   0.996109  0.333333  0.276265  0.451362  \n",
       "12079   0.997406  0.346304  0.079118  0.948119  \n",
       "10401   0.998703  0.636835  0.399481  0.394293  \n",
       "\n",
       "[992 rows x 9 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_small.xs(196506, level=0).sort_values(by='size')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Assign stocks into clusters using k-means and Ward (agglomerative clustering)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "196506"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197006"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197506"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198006"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198506"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199006"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199506"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200006"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200506"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201006"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201506"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "202006"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "num_clusters = [10, 25, 50, 100]\n",
    "\n",
    "# for the annual characteristics, only loop through Junes\n",
    "charlist = ['p0', 'p1', 'p2', 'p3']\n",
    "\n",
    "date_list = list(df_small.index.get_level_values(level=0).drop_duplicates())\n",
    "date_list.sort()\n",
    "\n",
    "for d in date_list:\n",
    "    \n",
    "    if (d // 100) % 5 == 0:\n",
    "        display(d)\n",
    "    \n",
    "    data = df_small.xs(d, level=0).dropna(subset=charlist)\n",
    "    \n",
    "    for k in num_clusters:\n",
    "        \n",
    "        kmeans = KMeans(n_clusters=k, init='k-means++', n_init=100, tol=1e-6, verbose=0).fit(data[charlist])\n",
    "\n",
    "        AggClusters = AgglomerativeClustering(n_clusters=k).fit(data[charlist])\n",
    "\n",
    "        df_small.loc[(d, data.index), 'kmeans' + str(k)] = kmeans.labels_\n",
    "        \n",
    "        df_small.loc[(d, data.index), 'ward' + str(k)] = AggClusters.labels_       "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>size</th>\n",
       "      <th>assetgrowth</th>\n",
       "      <th>booktomarket</th>\n",
       "      <th>ffprofitability</th>\n",
       "      <th>exchcd</th>\n",
       "      <th>p0</th>\n",
       "      <th>p1</th>\n",
       "      <th>p2</th>\n",
       "      <th>p3</th>\n",
       "      <th>kmeans10</th>\n",
       "      <th>ward10</th>\n",
       "      <th>kmeans25</th>\n",
       "      <th>ward25</th>\n",
       "      <th>kmeans50</th>\n",
       "      <th>ward50</th>\n",
       "      <th>kmeans100</th>\n",
       "      <th>ward100</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>permno</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10006</th>\n",
       "      <td>140.759370</td>\n",
       "      <td>0.459617</td>\n",
       "      <td>1.199740</td>\n",
       "      <td>0.160064</td>\n",
       "      <td>1</td>\n",
       "      <td>0.511424</td>\n",
       "      <td>0.984183</td>\n",
       "      <td>0.750439</td>\n",
       "      <td>0.205624</td>\n",
       "      <td>2.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>36.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>36.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10014</th>\n",
       "      <td>11.117250</td>\n",
       "      <td>0.424276</td>\n",
       "      <td>1.084354</td>\n",
       "      <td>0.160643</td>\n",
       "      <td>1</td>\n",
       "      <td>0.057996</td>\n",
       "      <td>0.980668</td>\n",
       "      <td>0.683656</td>\n",
       "      <td>0.209139</td>\n",
       "      <td>8.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>11.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>5.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10102</th>\n",
       "      <td>269.448760</td>\n",
       "      <td>0.142290</td>\n",
       "      <td>0.736371</td>\n",
       "      <td>0.234265</td>\n",
       "      <td>1</td>\n",
       "      <td>0.655536</td>\n",
       "      <td>0.854130</td>\n",
       "      <td>0.495606</td>\n",
       "      <td>0.488576</td>\n",
       "      <td>7.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>88.0</td>\n",
       "      <td>97.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10137</th>\n",
       "      <td>475.070740</td>\n",
       "      <td>0.060925</td>\n",
       "      <td>0.476324</td>\n",
       "      <td>0.341060</td>\n",
       "      <td>1</td>\n",
       "      <td>0.778559</td>\n",
       "      <td>0.585237</td>\n",
       "      <td>0.233743</td>\n",
       "      <td>0.820738</td>\n",
       "      <td>0.0</td>\n",
       "      <td>7.0</td>\n",
       "      <td>1.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>34.0</td>\n",
       "      <td>15.0</td>\n",
       "      <td>83.0</td>\n",
       "      <td>96.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10145</th>\n",
       "      <td>1283.601000</td>\n",
       "      <td>0.249240</td>\n",
       "      <td>0.544203</td>\n",
       "      <td>0.279307</td>\n",
       "      <td>1</td>\n",
       "      <td>0.934974</td>\n",
       "      <td>0.943761</td>\n",
       "      <td>0.314587</td>\n",
       "      <td>0.657293</td>\n",
       "      <td>0.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>21.0</td>\n",
       "      <td>52.0</td>\n",
       "      <td>39.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27721</th>\n",
       "      <td>21.756001</td>\n",
       "      <td>0.029536</td>\n",
       "      <td>0.846687</td>\n",
       "      <td>0.240819</td>\n",
       "      <td>1</td>\n",
       "      <td>0.137083</td>\n",
       "      <td>0.378735</td>\n",
       "      <td>0.581722</td>\n",
       "      <td>0.509666</td>\n",
       "      <td>9.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>12.0</td>\n",
       "      <td>35.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>18.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28054</th>\n",
       "      <td>34.692001</td>\n",
       "      <td>0.015385</td>\n",
       "      <td>0.668114</td>\n",
       "      <td>0.230043</td>\n",
       "      <td>1</td>\n",
       "      <td>0.214411</td>\n",
       "      <td>0.268893</td>\n",
       "      <td>0.428822</td>\n",
       "      <td>0.481547</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>42.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>40.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28142</th>\n",
       "      <td>26.349375</td>\n",
       "      <td>0.102662</td>\n",
       "      <td>1.581869</td>\n",
       "      <td>0.192690</td>\n",
       "      <td>1</td>\n",
       "      <td>0.168717</td>\n",
       "      <td>0.768014</td>\n",
       "      <td>0.876977</td>\n",
       "      <td>0.333919</td>\n",
       "      <td>8.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>67.0</td>\n",
       "      <td>29.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29735</th>\n",
       "      <td>3.470500</td>\n",
       "      <td>0.083333</td>\n",
       "      <td>0.514376</td>\n",
       "      <td>0.132042</td>\n",
       "      <td>2</td>\n",
       "      <td>0.000753</td>\n",
       "      <td>0.363876</td>\n",
       "      <td>0.118174</td>\n",
       "      <td>0.099268</td>\n",
       "      <td>9.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>6.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>43.0</td>\n",
       "      <td>45.0</td>\n",
       "      <td>21.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53023</th>\n",
       "      <td>64.764000</td>\n",
       "      <td>-0.080585</td>\n",
       "      <td>1.769399</td>\n",
       "      <td>0.179813</td>\n",
       "      <td>1</td>\n",
       "      <td>0.344464</td>\n",
       "      <td>0.029877</td>\n",
       "      <td>0.912127</td>\n",
       "      <td>0.265378</td>\n",
       "      <td>3.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>32.0</td>\n",
       "      <td>2.0</td>\n",
       "      <td>29.0</td>\n",
       "      <td>67.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>570 rows × 17 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               size  assetgrowth  booktomarket  ffprofitability  exchcd  \\\n",
       "permno                                                                    \n",
       "10006    140.759370     0.459617      1.199740         0.160064       1   \n",
       "10014     11.117250     0.424276      1.084354         0.160643       1   \n",
       "10102    269.448760     0.142290      0.736371         0.234265       1   \n",
       "10137    475.070740     0.060925      0.476324         0.341060       1   \n",
       "10145   1283.601000     0.249240      0.544203         0.279307       1   \n",
       "...             ...          ...           ...              ...     ...   \n",
       "27721     21.756001     0.029536      0.846687         0.240819       1   \n",
       "28054     34.692001     0.015385      0.668114         0.230043       1   \n",
       "28142     26.349375     0.102662      1.581869         0.192690       1   \n",
       "29735      3.470500     0.083333      0.514376         0.132042       2   \n",
       "53023     64.764000    -0.080585      1.769399         0.179813       1   \n",
       "\n",
       "              p0        p1        p2        p3  kmeans10  ward10  kmeans25  \\\n",
       "permno                                                                       \n",
       "10006   0.511424  0.984183  0.750439  0.205624       2.0     6.0       2.0   \n",
       "10014   0.057996  0.980668  0.683656  0.209139       8.0     2.0      10.0   \n",
       "10102   0.655536  0.854130  0.495606  0.488576       7.0     6.0       2.0   \n",
       "10137   0.778559  0.585237  0.233743  0.820738       0.0     7.0       1.0   \n",
       "10145   0.934974  0.943761  0.314587  0.657293       0.0     9.0      19.0   \n",
       "...          ...       ...       ...       ...       ...     ...       ...   \n",
       "27721   0.137083  0.378735  0.581722  0.509666       9.0     0.0      18.0   \n",
       "28054   0.214411  0.268893  0.428822  0.481547       9.0     3.0      18.0   \n",
       "28142   0.168717  0.768014  0.876977  0.333919       8.0     2.0      10.0   \n",
       "29735   0.000753  0.363876  0.118174  0.099268       9.0     3.0       9.0   \n",
       "53023   0.344464  0.029877  0.912127  0.265378       3.0     5.0       0.0   \n",
       "\n",
       "        ward25  kmeans50  ward50  kmeans100  ward100  \n",
       "permno                                                \n",
       "10006      1.0      18.0    36.0        3.0     36.0  \n",
       "10014     11.0      25.0     3.0       35.0      5.0  \n",
       "10102      0.0       5.0    25.0       88.0     97.0  \n",
       "10137      2.0      34.0    15.0       83.0     96.0  \n",
       "10145      4.0      30.0    21.0       52.0     39.0  \n",
       "...        ...       ...     ...        ...      ...  \n",
       "27721     12.0      35.0    19.0       96.0     18.0  \n",
       "28054      6.0      42.0    32.0       22.0     40.0  \n",
       "28142     10.0      10.0     9.0       67.0     29.0  \n",
       "29735      6.0      17.0    43.0       45.0     21.0  \n",
       "53023     24.0      32.0     2.0       29.0     67.0  \n",
       "\n",
       "[570 rows x 17 columns]"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_small.xs(196306,level=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_small[['kmeans10', 'kmeans25', 'kmeans50', 'kmeans100', 'ward10', 'ward25', 'ward50', 'ward100']].to_csv('~/Dropbox/Research/Factor Momentum/Python/clusters_small.csv')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Monthly data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th>marketbeta</th>\n",
       "      <th>idiosyncraticvol</th>\n",
       "      <th>ltrev</th>\n",
       "      <th>size</th>\n",
       "      <th>qmj</th>\n",
       "      <th>assetgrowth</th>\n",
       "      <th>booktomarket</th>\n",
       "      <th>ffprofitability</th>\n",
       "      <th>exchcd</th>\n",
       "      <th>p0</th>\n",
       "      <th>p1</th>\n",
       "      <th>p2</th>\n",
       "      <th>p3</th>\n",
       "      <th>p4</th>\n",
       "      <th>p5</th>\n",
       "      <th>p6</th>\n",
       "      <th>p7</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>yyyymm</th>\n",
       "      <th>permno</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>198806</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.090655</td>\n",
       "      <td>0.575131</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.20</td>\n",
       "      <td>-0.525444</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "      <td>0.022663</td>\n",
       "      <td>0.518488</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.003213</td>\n",
       "      <td>0.421619</td>\n",
       "      <td>0.127090</td>\n",
       "      <td>0.930636</td>\n",
       "      <td>0.160905</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198807</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.086793</td>\n",
       "      <td>0.508968</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.20</td>\n",
       "      <td>-0.528242</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "      <td>0.024137</td>\n",
       "      <td>0.917800</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.003194</td>\n",
       "      <td>0.424092</td>\n",
       "      <td>0.130391</td>\n",
       "      <td>0.920861</td>\n",
       "      <td>0.159606</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198808</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.079474</td>\n",
       "      <td>0.449557</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.20</td>\n",
       "      <td>-0.535590</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "      <td>0.020929</td>\n",
       "      <td>0.906290</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.003106</td>\n",
       "      <td>0.432743</td>\n",
       "      <td>0.141944</td>\n",
       "      <td>0.911085</td>\n",
       "      <td>0.158307</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198809</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.072340</td>\n",
       "      <td>0.432542</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.20</td>\n",
       "      <td>-0.551552</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "      <td>0.013954</td>\n",
       "      <td>0.676218</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.003203</td>\n",
       "      <td>0.452533</td>\n",
       "      <td>0.140293</td>\n",
       "      <td>0.901310</td>\n",
       "      <td>0.157008</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>198810</th>\n",
       "      <th>10001</th>\n",
       "      <td>0.073450</td>\n",
       "      <td>0.539960</td>\n",
       "      <td>NaN</td>\n",
       "      <td>6.20</td>\n",
       "      <td>-0.541882</td>\n",
       "      <td>-0.038474</td>\n",
       "      <td>1.207618</td>\n",
       "      <td>0.110827</td>\n",
       "      <td>3</td>\n",
       "      <td>0.048190</td>\n",
       "      <td>0.975014</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0.003184</td>\n",
       "      <td>0.415930</td>\n",
       "      <td>0.128740</td>\n",
       "      <td>0.891534</td>\n",
       "      <td>0.155709</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202107</th>\n",
       "      <th>93436</th>\n",
       "      <td>1.939772</td>\n",
       "      <td>0.305867</td>\n",
       "      <td>5.093786</td>\n",
       "      <td>668826.88</td>\n",
       "      <td>-0.393244</td>\n",
       "      <td>0.519951</td>\n",
       "      <td>0.032812</td>\n",
       "      <td>0.171654</td>\n",
       "      <td>3</td>\n",
       "      <td>0.920721</td>\n",
       "      <td>0.706851</td>\n",
       "      <td>0.965588</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.461973</td>\n",
       "      <td>0.926307</td>\n",
       "      <td>0.012791</td>\n",
       "      <td>0.269733</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202108</th>\n",
       "      <th>93436</th>\n",
       "      <td>1.926760</td>\n",
       "      <td>0.289957</td>\n",
       "      <td>10.752275</td>\n",
       "      <td>668826.88</td>\n",
       "      <td>-0.403173</td>\n",
       "      <td>0.519951</td>\n",
       "      <td>0.032812</td>\n",
       "      <td>0.171654</td>\n",
       "      <td>3</td>\n",
       "      <td>0.915504</td>\n",
       "      <td>0.732214</td>\n",
       "      <td>0.997125</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.439507</td>\n",
       "      <td>0.926086</td>\n",
       "      <td>0.012636</td>\n",
       "      <td>0.263647</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202109</th>\n",
       "      <th>93436</th>\n",
       "      <td>1.872851</td>\n",
       "      <td>0.269849</td>\n",
       "      <td>9.513405</td>\n",
       "      <td>668826.88</td>\n",
       "      <td>-0.417473</td>\n",
       "      <td>0.519951</td>\n",
       "      <td>0.032812</td>\n",
       "      <td>0.171654</td>\n",
       "      <td>3</td>\n",
       "      <td>0.858287</td>\n",
       "      <td>0.642387</td>\n",
       "      <td>0.983239</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.433954</td>\n",
       "      <td>0.926528</td>\n",
       "      <td>0.012482</td>\n",
       "      <td>0.275819</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202110</th>\n",
       "      <th>93436</th>\n",
       "      <td>1.973794</td>\n",
       "      <td>0.338077</td>\n",
       "      <td>8.812370</td>\n",
       "      <td>668826.88</td>\n",
       "      <td>-0.439016</td>\n",
       "      <td>0.519951</td>\n",
       "      <td>0.032812</td>\n",
       "      <td>0.171654</td>\n",
       "      <td>3</td>\n",
       "      <td>0.820107</td>\n",
       "      <td>0.632387</td>\n",
       "      <td>0.992073</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.433453</td>\n",
       "      <td>0.925645</td>\n",
       "      <td>0.012945</td>\n",
       "      <td>0.266690</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>202111</th>\n",
       "      <th>93436</th>\n",
       "      <td>2.019042</td>\n",
       "      <td>0.481740</td>\n",
       "      <td>13.984159</td>\n",
       "      <td>668826.88</td>\n",
       "      <td>0.409665</td>\n",
       "      <td>0.519951</td>\n",
       "      <td>0.032812</td>\n",
       "      <td>0.171654</td>\n",
       "      <td>3</td>\n",
       "      <td>0.941812</td>\n",
       "      <td>0.942366</td>\n",
       "      <td>0.998905</td>\n",
       "      <td>1.000000</td>\n",
       "      <td>0.673471</td>\n",
       "      <td>0.925865</td>\n",
       "      <td>0.012173</td>\n",
       "      <td>0.260604</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2374161 rows × 17 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "               marketbeta  idiosyncraticvol      ltrev       size       qmj  \\\n",
       "yyyymm permno                                                                 \n",
       "198806 10001     0.090655          0.575131        NaN       6.20 -0.525444   \n",
       "198807 10001     0.086793          0.508968        NaN       6.20 -0.528242   \n",
       "198808 10001     0.079474          0.449557        NaN       6.20 -0.535590   \n",
       "198809 10001     0.072340          0.432542        NaN       6.20 -0.551552   \n",
       "198810 10001     0.073450          0.539960        NaN       6.20 -0.541882   \n",
       "...                   ...               ...        ...        ...       ...   \n",
       "202107 93436     1.939772          0.305867   5.093786  668826.88 -0.393244   \n",
       "202108 93436     1.926760          0.289957  10.752275  668826.88 -0.403173   \n",
       "202109 93436     1.872851          0.269849   9.513405  668826.88 -0.417473   \n",
       "202110 93436     1.973794          0.338077   8.812370  668826.88 -0.439016   \n",
       "202111 93436     2.019042          0.481740  13.984159  668826.88  0.409665   \n",
       "\n",
       "               assetgrowth  booktomarket  ffprofitability  exchcd        p0  \\\n",
       "yyyymm permno                                                                 \n",
       "198806 10001     -0.038474      1.207618         0.110827       3  0.022663   \n",
       "198807 10001     -0.038474      1.207618         0.110827       3  0.024137   \n",
       "198808 10001     -0.038474      1.207618         0.110827       3  0.020929   \n",
       "198809 10001     -0.038474      1.207618         0.110827       3  0.013954   \n",
       "198810 10001     -0.038474      1.207618         0.110827       3  0.048190   \n",
       "...                    ...           ...              ...     ...       ...   \n",
       "202107 93436      0.519951      0.032812         0.171654       3  0.920721   \n",
       "202108 93436      0.519951      0.032812         0.171654       3  0.915504   \n",
       "202109 93436      0.519951      0.032812         0.171654       3  0.858287   \n",
       "202110 93436      0.519951      0.032812         0.171654       3  0.820107   \n",
       "202111 93436      0.519951      0.032812         0.171654       3  0.941812   \n",
       "\n",
       "                     p1        p2        p3        p4        p5        p6  \\\n",
       "yyyymm permno                                                               \n",
       "198806 10001   0.518488       NaN  0.003213  0.421619  0.127090  0.930636   \n",
       "198807 10001   0.917800       NaN  0.003194  0.424092  0.130391  0.920861   \n",
       "198808 10001   0.906290       NaN  0.003106  0.432743  0.141944  0.911085   \n",
       "198809 10001   0.676218       NaN  0.003203  0.452533  0.140293  0.901310   \n",
       "198810 10001   0.975014       NaN  0.003184  0.415930  0.128740  0.891534   \n",
       "...                 ...       ...       ...       ...       ...       ...   \n",
       "202107 93436   0.706851  0.965588  1.000000  0.461973  0.926307  0.012791   \n",
       "202108 93436   0.732214  0.997125  1.000000  0.439507  0.926086  0.012636   \n",
       "202109 93436   0.642387  0.983239  1.000000  0.433954  0.926528  0.012482   \n",
       "202110 93436   0.632387  0.992073  1.000000  0.433453  0.925645  0.012945   \n",
       "202111 93436   0.942366  0.998905  1.000000  0.673471  0.925865  0.012173   \n",
       "\n",
       "                     p7  \n",
       "yyyymm permno            \n",
       "198806 10001   0.160905  \n",
       "198807 10001   0.159606  \n",
       "198808 10001   0.158307  \n",
       "198809 10001   0.157008  \n",
       "198810 10001   0.155709  \n",
       "...                 ...  \n",
       "202107 93436   0.269733  \n",
       "202108 93436   0.263647  \n",
       "202109 93436   0.275819  \n",
       "202110 93436   0.266690  \n",
       "202111 93436   0.260604  \n",
       "\n",
       "[2374161 rows x 17 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "196312"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "196412"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "196512"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "196612"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "196712"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "196812"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "196912"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197012"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197112"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197212"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197312"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197412"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197512"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197612"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197712"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197812"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "197912"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198012"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198112"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198212"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198312"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198412"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198512"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198612"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198712"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198812"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "198912"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199012"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199112"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199212"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199312"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199412"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199512"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199612"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199712"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199812"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "199912"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200012"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200112"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200212"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200312"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200412"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200512"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200612"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200712"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200812"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "200912"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201012"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201112"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201212"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201312"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201412"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201512"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201612"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201712"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201812"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "201912"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "202012"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "num_clusters = [10, 25, 50, 100]\n",
    "\n",
    "# for the annual characteristics, only loop through Junes\n",
    "charlist = ['p0', 'p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7']\n",
    "\n",
    "date_list = list(df_full.index.get_level_values(level=0).drop_duplicates())\n",
    "date_list.sort()\n",
    "\n",
    "for d in date_list:\n",
    "    \n",
    "    if (d % 100) == 12:\n",
    "        display(d)\n",
    "    \n",
    "    data = df_full.xs(d, level=0).dropna(subset=charlist)\n",
    "    \n",
    "    for k in num_clusters:\n",
    "        \n",
    "        kmeans = KMeans(n_clusters=k, init='k-means++', n_init=100, tol=1e-6, verbose=0).fit(data[charlist])\n",
    "\n",
    "        AggClusters = AgglomerativeClustering(n_clusters=k).fit(data[charlist])\n",
    "\n",
    "        df_full.loc[(d, data.index), 'kmeans' + str(k)] = kmeans.labels_\n",
    "        \n",
    "        df_full.loc[(d, data.index), 'ward' + str(k)] = AggClusters.labels_       "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>marketbeta</th>\n",
       "      <th>idiosyncraticvol</th>\n",
       "      <th>ltrev</th>\n",
       "      <th>size</th>\n",
       "      <th>qmj</th>\n",
       "      <th>assetgrowth</th>\n",
       "      <th>booktomarket</th>\n",
       "      <th>ffprofitability</th>\n",
       "      <th>exchcd</th>\n",
       "      <th>p0</th>\n",
       "      <th>...</th>\n",
       "      <th>p4</th>\n",
       "      <th>p5</th>\n",
       "      <th>p6</th>\n",
       "      <th>p7</th>\n",
       "      <th>ward10</th>\n",
       "      <th>ward25</th>\n",
       "      <th>ward50</th>\n",
       "      <th>ward100</th>\n",
       "      <th>kmeans25</th>\n",
       "      <th>kmeans50</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>permno</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10006</th>\n",
       "      <td>1.037167</td>\n",
       "      <td>0.132136</td>\n",
       "      <td>0.937436</td>\n",
       "      <td>140.759370</td>\n",
       "      <td>-0.107722</td>\n",
       "      <td>0.459617</td>\n",
       "      <td>1.199740</td>\n",
       "      <td>0.160064</td>\n",
       "      <td>1</td>\n",
       "      <td>0.574956</td>\n",
       "      <td>...</td>\n",
       "      <td>0.504394</td>\n",
       "      <td>0.984183</td>\n",
       "      <td>0.750439</td>\n",
       "      <td>0.205624</td>\n",
       "      <td>9.0</td>\n",
       "      <td>9.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>31.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>20.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10014</th>\n",
       "      <td>0.663494</td>\n",
       "      <td>0.462344</td>\n",
       "      <td>0.085000</td>\n",
       "      <td>11.117250</td>\n",
       "      <td>-1.264312</td>\n",
       "      <td>0.424276</td>\n",
       "      <td>1.084354</td>\n",
       "      <td>0.160643</td>\n",
       "      <td>1</td>\n",
       "      <td>0.155203</td>\n",
       "      <td>...</td>\n",
       "      <td>0.173989</td>\n",
       "      <td>0.980668</td>\n",
       "      <td>0.683656</td>\n",
       "      <td>0.209139</td>\n",
       "      <td>8.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>73.0</td>\n",
       "      <td>19.0</td>\n",
       "      <td>42.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10102</th>\n",
       "      <td>1.008943</td>\n",
       "      <td>0.155061</td>\n",
       "      <td>-0.078569</td>\n",
       "      <td>269.448760</td>\n",
       "      <td>-1.593147</td>\n",
       "      <td>0.142290</td>\n",
       "      <td>0.736371</td>\n",
       "      <td>0.234265</td>\n",
       "      <td>1</td>\n",
       "      <td>0.536155</td>\n",
       "      <td>...</td>\n",
       "      <td>0.065026</td>\n",
       "      <td>0.854130</td>\n",
       "      <td>0.495606</td>\n",
       "      <td>0.488576</td>\n",
       "      <td>3.0</td>\n",
       "      <td>16.0</td>\n",
       "      <td>17.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>48.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10137</th>\n",
       "      <td>0.647443</td>\n",
       "      <td>0.114100</td>\n",
       "      <td>0.674959</td>\n",
       "      <td>475.070740</td>\n",
       "      <td>-0.421437</td>\n",
       "      <td>0.060925</td>\n",
       "      <td>0.476324</td>\n",
       "      <td>0.341060</td>\n",
       "      <td>1</td>\n",
       "      <td>0.139330</td>\n",
       "      <td>...</td>\n",
       "      <td>0.402460</td>\n",
       "      <td>0.585237</td>\n",
       "      <td>0.233743</td>\n",
       "      <td>0.820738</td>\n",
       "      <td>0.0</td>\n",
       "      <td>10.0</td>\n",
       "      <td>3.0</td>\n",
       "      <td>86.0</td>\n",
       "      <td>24.0</td>\n",
       "      <td>22.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10145</th>\n",
       "      <td>1.078029</td>\n",
       "      <td>0.158371</td>\n",
       "      <td>-0.027541</td>\n",
       "      <td>1283.601000</td>\n",
       "      <td>0.349623</td>\n",
       "      <td>0.249240</td>\n",
       "      <td>0.544203</td>\n",
       "      <td>0.279307</td>\n",
       "      <td>1</td>\n",
       "      <td>0.619048</td>\n",
       "      <td>...</td>\n",
       "      <td>0.627417</td>\n",
       "      <td>0.943761</td>\n",
       "      <td>0.314587</td>\n",
       "      <td>0.657293</td>\n",
       "      <td>3.0</td>\n",
       "      <td>8.0</td>\n",
       "      <td>30.0</td>\n",
       "      <td>98.0</td>\n",
       "      <td>22.0</td>\n",
       "      <td>17.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27721</th>\n",
       "      <td>1.559775</td>\n",
       "      <td>0.274158</td>\n",
       "      <td>NaN</td>\n",
       "      <td>21.756001</td>\n",
       "      <td>-0.209774</td>\n",
       "      <td>0.029536</td>\n",
       "      <td>0.846687</td>\n",
       "      <td>0.240819</td>\n",
       "      <td>1</td>\n",
       "      <td>0.915344</td>\n",
       "      <td>...</td>\n",
       "      <td>0.469244</td>\n",
       "      <td>0.378735</td>\n",
       "      <td>0.581722</td>\n",
       "      <td>0.509666</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28054</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.238325</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.692001</td>\n",
       "      <td>-0.939257</td>\n",
       "      <td>0.015385</td>\n",
       "      <td>0.668114</td>\n",
       "      <td>0.230043</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.265378</td>\n",
       "      <td>0.268893</td>\n",
       "      <td>0.428822</td>\n",
       "      <td>0.481547</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28142</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.213667</td>\n",
       "      <td>NaN</td>\n",
       "      <td>26.349375</td>\n",
       "      <td>-1.196277</td>\n",
       "      <td>0.102662</td>\n",
       "      <td>1.581869</td>\n",
       "      <td>0.192690</td>\n",
       "      <td>1</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.195079</td>\n",
       "      <td>0.768014</td>\n",
       "      <td>0.876977</td>\n",
       "      <td>0.333919</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29735</th>\n",
       "      <td>NaN</td>\n",
       "      <td>0.831045</td>\n",
       "      <td>NaN</td>\n",
       "      <td>3.470500</td>\n",
       "      <td>0.538608</td>\n",
       "      <td>0.083333</td>\n",
       "      <td>0.514376</td>\n",
       "      <td>0.132042</td>\n",
       "      <td>2</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>0.707432</td>\n",
       "      <td>0.375575</td>\n",
       "      <td>0.204479</td>\n",
       "      <td>0.095878</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>53023</th>\n",
       "      <td>0.980558</td>\n",
       "      <td>0.133151</td>\n",
       "      <td>-0.231552</td>\n",
       "      <td>64.764000</td>\n",
       "      <td>-0.659559</td>\n",
       "      <td>-0.080585</td>\n",
       "      <td>1.769399</td>\n",
       "      <td>0.179813</td>\n",
       "      <td>1</td>\n",
       "      <td>0.499118</td>\n",
       "      <td>...</td>\n",
       "      <td>0.330404</td>\n",
       "      <td>0.029877</td>\n",
       "      <td>0.912127</td>\n",
       "      <td>0.265378</td>\n",
       "      <td>4.0</td>\n",
       "      <td>4.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>59.0</td>\n",
       "      <td>18.0</td>\n",
       "      <td>6.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>570 rows × 23 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        marketbeta  idiosyncraticvol     ltrev         size       qmj  \\\n",
       "permno                                                                  \n",
       "10006     1.037167          0.132136  0.937436   140.759370 -0.107722   \n",
       "10014     0.663494          0.462344  0.085000    11.117250 -1.264312   \n",
       "10102     1.008943          0.155061 -0.078569   269.448760 -1.593147   \n",
       "10137     0.647443          0.114100  0.674959   475.070740 -0.421437   \n",
       "10145     1.078029          0.158371 -0.027541  1283.601000  0.349623   \n",
       "...            ...               ...       ...          ...       ...   \n",
       "27721     1.559775          0.274158       NaN    21.756001 -0.209774   \n",
       "28054          NaN          0.238325       NaN    34.692001 -0.939257   \n",
       "28142          NaN          0.213667       NaN    26.349375 -1.196277   \n",
       "29735          NaN          0.831045       NaN     3.470500  0.538608   \n",
       "53023     0.980558          0.133151 -0.231552    64.764000 -0.659559   \n",
       "\n",
       "        assetgrowth  booktomarket  ffprofitability  exchcd        p0  ...  \\\n",
       "permno                                                                ...   \n",
       "10006      0.459617      1.199740         0.160064       1  0.574956  ...   \n",
       "10014      0.424276      1.084354         0.160643       1  0.155203  ...   \n",
       "10102      0.142290      0.736371         0.234265       1  0.536155  ...   \n",
       "10137      0.060925      0.476324         0.341060       1  0.139330  ...   \n",
       "10145      0.249240      0.544203         0.279307       1  0.619048  ...   \n",
       "...             ...           ...              ...     ...       ...  ...   \n",
       "27721      0.029536      0.846687         0.240819       1  0.915344  ...   \n",
       "28054      0.015385      0.668114         0.230043       1       NaN  ...   \n",
       "28142      0.102662      1.581869         0.192690       1       NaN  ...   \n",
       "29735      0.083333      0.514376         0.132042       2       NaN  ...   \n",
       "53023     -0.080585      1.769399         0.179813       1  0.499118  ...   \n",
       "\n",
       "              p4        p5        p6        p7  ward10  ward25  ward50  \\\n",
       "permno                                                                   \n",
       "10006   0.504394  0.984183  0.750439  0.205624     9.0     9.0    10.0   \n",
       "10014   0.173989  0.980668  0.683656  0.209139     8.0    22.0    22.0   \n",
       "10102   0.065026  0.854130  0.495606  0.488576     3.0    16.0    17.0   \n",
       "10137   0.402460  0.585237  0.233743  0.820738     0.0    10.0     3.0   \n",
       "10145   0.627417  0.943761  0.314587  0.657293     3.0     8.0    30.0   \n",
       "...          ...       ...       ...       ...     ...     ...     ...   \n",
       "27721   0.469244  0.378735  0.581722  0.509666     NaN     NaN     NaN   \n",
       "28054   0.265378  0.268893  0.428822  0.481547     NaN     NaN     NaN   \n",
       "28142   0.195079  0.768014  0.876977  0.333919     NaN     NaN     NaN   \n",
       "29735   0.707432  0.375575  0.204479  0.095878     NaN     NaN     NaN   \n",
       "53023   0.330404  0.029877  0.912127  0.265378     4.0     4.0    25.0   \n",
       "\n",
       "        ward100  kmeans25  kmeans50  \n",
       "permno                               \n",
       "10006      31.0      10.0      20.0  \n",
       "10014      73.0      19.0      42.0  \n",
       "10102      90.0      22.0      48.0  \n",
       "10137      86.0      24.0      22.0  \n",
       "10145      98.0      22.0      17.0  \n",
       "...         ...       ...       ...  \n",
       "27721       NaN       NaN       NaN  \n",
       "28054       NaN       NaN       NaN  \n",
       "28142       NaN       NaN       NaN  \n",
       "29735       NaN       NaN       NaN  \n",
       "53023      59.0      18.0       6.0  \n",
       "\n",
       "[570 rows x 23 columns]"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_full.xs(196307,level=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Write cluster assignments to a csv-file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "#df_full[['kmeans10', 'kmeans25', 'kmeans50', 'kmeans100', 'ward10', 'ward25', 'ward50', 'ward100']].to_csv('~/Dropbox/Research/Factor Momentum/Python/clusters_big.csv')\n",
    "df_full[['kmeans25', 'kmeans50', 'ward10', 'ward25', 'ward50', 'ward100']].to_csv('~/Dropbox/Research/Factor Momentum/Python/clusters_big.csv')\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "        df_small.loc[(d, data.index), 'Ward' + str(k)] = AggClusters.labels_        \n",
    "\n",
    "## Unused snippets of code below for alternative clustering methods and/or the (optimal) choice of the number of clusters in k-means"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import DBSCAN\n",
    "from sklearn.cluster import AffinityPropagation\n",
    "from sklearn.metrics.pairwise import pairwise_distances\n",
    "from sklearn.metrics import silhouette_score\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# using inertia (elbow) or silhuette scores to find the optimal number of clusters?\n",
    "\n",
    "range_n_clusters = list(np.arange(2,40,1))\n",
    "silhouette_avg = []\n",
    "ssq = []\n",
    "\n",
    "for num_clusters in range_n_clusters:\n",
    " \n",
    "    # initialise kmeans\n",
    "    kmeans = KMeans(n_clusters=num_clusters, init='k-means++', tol=1e-6, n_init=20).fit(data[characteristics])\n",
    "    cluster_labels = kmeans.labels_\n",
    "\n",
    "    #silhouette score\n",
    "    silhouette_avg.append(silhouette_score(data[characteristics], cluster_labels))\n",
    "    \n",
    "    #inertia\n",
    "    ssq.append(kmeans.inertia_)\n",
    "\n",
    "# plot either silhouette score *or* ssq\n",
    "\n",
    "#plt.plot(range_n_clusters,silhouette_avg,'bx-')\n",
    "plt.plot(range_n_clusters,ssq,'bx-')\n",
    "plt.xlabel('Values of K') \n",
    "plt.ylabel('Silhouette score') \n",
    "plt.title('Silhouette analysis For Optimal k')\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Affinity Propagation\n",
    "\n",
    "afprop = AffinityPropagation(max_iter=2000, random_state=None).fit(data[characteristics])\n",
    "cluster_data.loc[(d, data.index), 'afprop'] = afprop.labels_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DBSCAN test\n",
    "\n",
    "# define the list of maximum distances fir DBSCAN\n",
    "eps_list = list(np.arange(0.001, 0.01, 0.001)) + list(np.arange(0.01, 0.2, 0.005))\n",
    "\n",
    "data = cluster_data.xs(196306, level=0)\n",
    "\n",
    "# minimize Herfindahl index\n",
    "\n",
    "mc = {'hf': 10000, 'cluster_data': []}\n",
    "\n",
    "hf_list = []\n",
    "\n",
    "for e in eps_list:\n",
    "\n",
    "    dbs = DBSCAN(eps=e, min_samples=5).fit(data[characteristics])\n",
    "\n",
    "    N = pd.DataFrame(dbs.labels_).value_counts()\n",
    "    hf = ((N/N.sum())**2).sum() / len(np.unique(dbs.labels_))\n",
    "\n",
    "    hf_list.append(hf)\n",
    "\n",
    "    if hf < mc['hf']:\n",
    "        mc['hf'] = hf\n",
    "        mc['cluster_data'] = dbs\n",
    "\n",
    "pd.DataFrame(hf_list).plot()\n",
    "#cluster_data.loc[(d, data.index), 'DBSCAN'] = mc['cluster_data'].labels_"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
